import numpy as np
import math
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyecharts import options as opts
from pyecharts.charts import Map
from pyecharts.faker import Collector, Faker
from pyecharts.datasets import register_url
import plotly.figure_factory as ff
import time
# Record the CPU start time for the whole notebook; end0 - start0 is printed at the bottom
start0 = time.process_time()
# IPython magics: load the dmind extension used for the markdown cells below
%load_ext dmind
%dmindheader
%%dmind markdown right
# Three data sets
## Census
### Linear Regression
#### Data clean (delete the inf and nan) + (some meaningless columns)
#### Feature scaling (min_max) + (Z-score)
#### Data visualization (the distribution of house prices in the USA)
#### Data partition / move the target to the last column + change dataset into svm format
## Adult
### Classification
#### Data clean (delete the '?') + (one_hot encoding)
#### Feature scaling (min_max) + (Z-score)
#### Data visualization (relationship between age, income and sex)
#### Data partition / change dataset into libsvm and svmlight format
## CPU
### Linear Regression
#### Data clean (delete the inf and nan) + (some meaningless columns)
#### Feature scaling (min_max) + (Z-score)
#### Data visualization (linear relationship between usr and other features)
#### Data partition / move the target to the last column + change dataset into svm format
# First, I wanted to use the svmlight package to build an SVM model and solve the problem. Unfortunately, the package was hard to install, so I used the
## code shown below
@@
# !git clone https://github.com/mblondel/svmlight-loader
# %cd svmlight-loader
# !python setup.py build && python setup.py install
# import svmlight
@@
## Actually, you can also find other websites (GitHub) or the official website to install this package. For some reason, I failed. In my opinion,
## it is closely related to Visual C++ and I did not have enough time to fix it. So, I downloaded the exe file and used PowerShell to build the SVMlight model. In the code below, you can find the function "generate_data_file"!
@@
## Generate data in svmlight package format(CPU)
#QQ=[]
#for j in range(len(CPUtrain_x_array)):
# for i in range('column number(CPU_clean)-1'):
# QQ.append((i+1,CPUtrain_x_array[j][i]))
#QQ2=list(np.linspace(0,(len(CPUtrain_x_array)-1) * 'column number(CPU_clean)-1' , num=len(CPUtrain_x_array), endpoint=True, retstep=False, dtype=int))
#QQ3=[]
#for(i,j) in zip(range(len(CPUtrain_x_array)),QQ2):
# QQ3.append((CPUtrain_y[i],QQ[j:j+'column number(CPU_clean)-1']))
#QQ3
@@
# Load the Census housing dataset from a local Excel file (first row = header)
Census_Dataset= pd.read_excel('C:\\Users\\sunjin\\Desktop\\MACHINE LEARNING\\Data\\census-house\\census-house\\Census_Dataset.xlsx', header=0)
Census_Dataset.info()
Census_Dataset.head()
# Boolean mask marking cells whose (stripped) string value is the '?' placeholder
Meaningless_Census_Dataset = Census_Dataset.applymap(lambda x: str(x).strip() == '?')
# Per-column count of '?' placeholders
Meaningless_Census_Dataset.sum()
# Load the Adult (census income) train set. The file has no header row and the
# fields are separated by ", "; the delimiter is a regex, so use a RAW string —
# the original ',\s' relied on an invalid string escape sequence ('\s'), which
# is a DeprecationWarning today and a SyntaxWarning/SyntaxError in newer Pythons.
adult = pd.read_csv('C:\\Users\\sunjin\\Desktop\\machine learning group project\\adult.data', header=None, delimiter=r',\s', engine='python')
adult.columns = ['Age', 'Workclass', 'Weight', 'Education', 'Education Num',
'Marital', 'Occupation', 'Relationship', 'Race', 'Sex',
'C-Gain', 'C-Loss', 'Hours', 'Country', 'Income']
adult.info()
adult.head()
# Load the Adult test set; skiprows=1 drops the non-data first line of adult.test
adult_test = pd.read_csv('C:\\Users\\sunjin\\Desktop\\machine learning group project\\adult.test', header=None, delimiter=r',\s', engine='python', skiprows=1)
adult_test.columns = ['Age', 'Workclass', 'Weight', 'Education', 'Education Num',
'Marital', 'Occupation', 'Relationship', 'Race', 'Sex',
'C-Gain', 'C-Loss', 'Hours', 'Country', 'Income']
adult_test['Income'] = adult_test['Income'].str[:-1]  # drop the trailing '.' ('<=50K.' -> '<=50K')
adult_test.info()
adult_test.head()
# Boolean masks marking cells whose (stripped) string value is the '?' placeholder
Meaningless_adult = adult.applymap(lambda x: str(x).strip() == '?')
Meaningless_adult_test = adult_test.applymap(lambda x: str(x).strip() == '?')
Meaningless_adult.sum()  # how many '?' per column (train set)
Meaningless_adult_test.sum()  # how many '?' per column (test set)
# Load the CPU activity (comp-activ) dataset: whitespace-separated, first line
# skipped. The delimiter is a regex — use a RAW string; the original '\s+'
# relied on an invalid string escape sequence.
CPU = pd.read_csv('C:\\Users\\sunjin\\Desktop\\MACHINE LEARNING\\Data\\comp-activ\\Dataset.data', header=None, delimiter=r'\s+', engine='python',skiprows=1)
CPU.columns= ['time','lread','lwrite', 'scall', 'sread', 'swrite','fork','exec','rchar','wchar','pgout','ppgout','pgfree','pgscan','atch','pgin','ppgin','pflt','vflt','runqsz','runocc','freemem','freeswap','usr','sys' ,'wio','idle']
CPU.info()
CPU.head()
# Census: drop identifier / redundant columns, then zero out NaN and +/-inf
Census_clean=Census_Dataset.drop(['State','Code','H23.A','H23.C','H24','P4.1','P4.2','P4.3','P4.4','H4.1','H4.2','H4.3','H4.4','H35.1'], axis = 1)
Census_clean[np.isnan(Census_clean)] = 0 #delete nan
Census_clean[np.isinf(Census_clean)] = 0 #delete inf
Census_clean.info()
Census_clean.head()
# Adult: keep only rows that contain no '?' placeholder in any column
adult_clean=adult[adult.applymap(lambda x: str(x).strip() == '?').apply(np.sum, axis=1) == 0].reset_index(drop=True)# Delete '?'
adult_test_clean=adult_test[adult_test.applymap(lambda x: str(x).strip() == '?').apply(np.sum, axis=1) == 0].reset_index(drop=True)
# Stack train + test so one-hot encoding sees the same category set for both
combine_adultclean=pd.concat([adult_clean,adult_test_clean],axis=0).drop(['Education'], axis = 1)#Data combination
# One-hot encode; keep only one income indicator column ('Income_>50K')
Onehotc_adult_clean=pd.get_dummies(combine_adultclean,dtype='int8').drop(['Income_<=50K'], axis = 1)
# Recode the label to {-1, +1} as expected by the SVM tools
Onehotc_adult_clean['Income_>50K']=Onehotc_adult_clean['Income_>50K'].replace(0, -1)
Onehotc_adult_clean.head()
# CPU: drop the 'time' column, then zero out NaN and +/-inf
CPU_clean=CPU.drop(['time'], axis = 1)
CPU_clean[np.isnan(CPU_clean)] = 0 #delete nan
CPU_clean[np.isinf(CPU_clean)] = 0 #delete inf
CPU_clean.info()
CPU_clean.head()
# Min-max scale every Census column into [0, 1] (in place)
from sklearn.preprocessing import MinMaxScaler
Census_clean.iloc[:, :]=MinMaxScaler().fit_transform(Census_clean.iloc[:, :])
display(Census_clean.mean())
display(Census_clean.var())
Census_clean.head()
# Adult: scale only the first 6 columns; the remaining columns are the 0/1
# one-hot indicators produced by get_dummies and are left untouched
from sklearn.preprocessing import MinMaxScaler
Onehotc_adult_clean.iloc[:, :6]=MinMaxScaler().fit_transform(Onehotc_adult_clean.iloc[:, :6])
display(Onehotc_adult_clean.iloc[:, :6].mean())
display(Onehotc_adult_clean.iloc[:, :6].var())
Onehotc_adult_clean.head()
# Min-max scale every CPU column into [0, 1] (in place)
from sklearn.preprocessing import MinMaxScaler
CPU_clean.iloc[:, :]=MinMaxScaler().fit_transform(CPU_clean.iloc[:, :])
display(CPU_clean.mean())
display(CPU_clean.var())
CPU_clean.head()
# One census row per state, sorted by state code
Censussample=Census_Dataset.drop_duplicates(['State']).sort_values(by=['State'])
# Helper table with state names / fips codes for the state-level map
data = pd.read_excel('C:\\Users\\sunjin\\Desktop\\MUST\\MUSTāCourseāPython tool\\Python presentation\\python data\\test2.xlsx')
data=data.drop_duplicates(['fips']).sort_values(by=['fips'])
# Drop rows that have no matching census state (fips 3, the last four rows, row 8)
data = data .drop(data [data .fips == 3].index)
data=data.drop(data.index[[-1, -2, -3,-4 ,8]], axis=0)
statename=list(data['state'])
HUprice=list(Censussample['H23.B'])
# [state name, house price] pairs — the input format pyecharts' Map.add expects
list1 = [[statename[i],HUprice[i]] for i in range(len(statename))]
map_1 = Map(init_opts=opts.InitOpts(width="1500px", height="600px"))
# NOTE(review): the maptype string looks like a mis-encoded non-ASCII map name
# (presumably pyecharts' built-in USA map) — confirm the file encoding
map_1.add("U.S. House Prices", list1, maptype="ē¾å½")
map_1.set_global_opts(
visualmap_opts=opts.VisualMapOpts(min_=0,max_=250000, is_piecewise=True, split_number=15),
legend_opts=opts.LegendOpts(is_show=True),
)
map_1.render_notebook()
# Build a county-fips -> house-price list so plotly can draw a county choropleth.
county_fips= pd.read_excel('C:\\Users\\sunjin\\Desktop\\fips-codes-master\\county_fips_master.xlsx')
county_fips=county_fips.iloc[:,[0,8]]  # keep only the fips and state columns
county_fips['state'] = county_fips['state'].fillna(0).astype(np.int64)
# One row per state, sorted by state code; drop the fill value 0 and code 11
county_fips2=county_fips.drop_duplicates(['state']).sort_values(by=['state'])
county_fips2=county_fips2.drop(county_fips2 [county_fips2.state == 0].index)
county_fips2=county_fips2.drop(county_fips2 [county_fips2.state == 11].index)
# Index of the first county row of each state inside county_fips
index = [county_fips2[county_fips2.state == s].index.tolist()[0]
         for s in county_fips2['state']]
# County count per state = gap between consecutive first-row indices;
# the trailing 23 is the county count of the last state (no successor to diff)
diffs = [y - x for x, y in zip(index, index[1:])]
diffs=diffs+[23]
# Census_Dataset: zero out NaN/inf so the price column is numeric everywhere
Census_Dataset[np.isnan(Census_Dataset)] = 0 #delete nan
Census_Dataset[np.isinf(Census_Dataset)] = 0 #delete inf
Censussample2=Census_Dataset.iloc[:,[0,111]]  # State code + H23.B (house price)
# For each state, take its first `j` census rows (one frame per state)
index2 = [Censussample2[Censussample2.State == i].head(j)
          for (i, j) in zip(list(county_fips2['state']), diffs)]
# Concatenate all per-state frames at once — the original spelled out all 50
# index2[k] terms by hand, which was error-prone and broke for any other count
df = pd.concat(index2, axis=0)
# Pre-operation for house price maps in USA
fipsname=list(county_fips['fips'])
HUprice2=list(df['H23.B'])
# House price in USA (county-level choropleth)
fig = ff.create_choropleth(fips=fipsname, values=HUprice2, county_outline={'color': 'rgb(255,255,255)', 'width': 0.5})
fig.show()
# Violin plot: Age distribution by Income bracket, split by Sex
plt.figure(figsize=(20, 10))
df = combine_adultclean
p1=sns.violinplot(x="Income", y="Age", hue='Sex', data=df, palette="Pastel1")
plt.ylabel("Age",fontsize=30,fontstyle="normal",fontweight='black')
plt.xlabel("Income",fontsize=30,fontstyle="normal",fontweight='black')
plt.tick_params(labelsize=20)
plt.show()
# Violin plot: Education Num distribution by Income bracket
plt.figure(figsize=(20, 10))
sns.set(color_codes=True)
sns.set_style("white")
sns.violinplot( x=df["Income"], y=df["Education Num"] )
plt.ylabel("Education Num",fontsize=30,fontstyle="normal",fontweight='black')
plt.xlabel("Income",fontsize=30,fontstyle="normal",fontweight='black')
plt.tick_params(labelsize=20)
# Display the scaled CPU frame (notebook cell output)
CPU_clean
# OLS regression of 'usr' on all other (scaled) CPU features via statsmodels
import statsmodels.api as sm
CPU_cleanl=CPU_clean.drop(['usr'], axis = 1)  # feature matrix without the target
CPU_clean_x = sm.add_constant(CPU_cleanl.iloc[:,:])  # prepend an intercept column
CPU_clean_y = CPU_clean.iloc[:,[22]]  # column 22 is 'usr', the regression target
model = sm.OLS(CPU_clean_y,CPU_clean_x)
result = model.fit()
result.summary()
%%dmind markdown right
#Filename+Function
## Function
### transfer_data: change data into libsvm format
### generate_data_file: change data into svmlight format
## Filename
### Census_clean+(Move'H23.B'to the last column)=Census_clean
### Census_clean+(dataset division)=Census_clean_traind / Census_clean_testd
### Census_clean_traind/Census_clean_testd+(transfer_data)=Censustrain_x_array / Censustrain_y
### Census_clean_traind+(generate_data_file)=Census_clean_traind_svmlight.txt
# Define the function (change type of dataset into LibSVM format)
def transfer_data(train_set,test_set):
    """Split train/test DataFrames into (X array, 1-D y array) pairs.

    The LAST column of each frame is treated as the label; every column
    before it is the feature matrix.  Returns
    (train_x_array, train_y, test_x_array, test_y).
    """
    def _split(frame):
        # Features: everything but the last column, as a plain ndarray
        features = np.array(frame.iloc[:, :-1])
        # Labels: the last column, flattened to shape (n,)
        labels = np.array(frame.iloc[:, -1])
        labels.shape = (len(labels))
        return features, labels

    train_x_array, train_y = _split(train_set)
    test_x_array, test_y = _split(test_set)
    return (train_x_array, train_y, test_x_array, test_y)
def generate_data_file(df, path):
    """Write DataFrame `df` to `path` in SVMlight format.

    Each row becomes one line '<label> 1:<v1> 2:<v2> ... \\n', where the label
    is the LAST column of `df` and the features are 1-indexed.  Prints the
    number of characters written.
    """
    # Build each line with join instead of the original quadratic `txt +=` loop
    lines = []
    for row in df.values:
        fields = [str(row[-1])] + [f'{idx}:{val}' for idx, val in enumerate(row[:-1], start=1)]
        lines.append(' '.join(fields) + ' \n')
    txt = ''.join(lines)
    print('The number of characters written in %s:' % path)
    # Context manager closes the handle deterministically — the original
    # `open(path, 'w').write(txt)` leaked the file handle
    with open(path, 'w') as fh:
        print(fh.write(txt))
# The new document position: C:\Users\sunjin\svmlight-loader
Census_clean.head()
# Move 'H23.B' (the regression target, column index 100) to the last column,
# since transfer_data/generate_data_file treat the last column as the label
last_col2 = Census_clean.pop(Census_clean.columns[100])
Census_clean=pd.concat( (Census_clean, last_col2.to_frame()),axis=1)
Census_clean.head()
# Divide the dataset into 80% train / 20% test
from sklearn.model_selection import train_test_split
#data: dataset that needs to be divided
#random_state: fixed seed so the split is reproducible
#test_size: proportion of the test set
Census_clean_traind, Census_clean_testd = train_test_split(Census_clean, test_size=0.2, random_state=42)
# Generate (X array, y array) pairs for LibSVM / liblinear
(Censustrain_x_array, Censustrain_y, Censustest_x_array, Censustest_y)= transfer_data(Census_clean_traind,Census_clean_testd)
# Generate SVMlight-format text files in the folder C:\Users\sunjin\svmlight-loader
generate_data_file(Census_clean_traind,"Census_clean_traind_svmlight.txt")
generate_data_file(Census_clean_testd, "Census_clean_testd_svmlight.txt")
Onehotc_adult_clean.head()
# Divide the adult dataset at the original train/test boundary:
# the first 30162 rows came from adult.data, the remainder from adult.test
d=30162
adult_traind = Onehotc_adult_clean.iloc[:d]
adult_testd = Onehotc_adult_clean.iloc[d:]
# Generate (X array, y array) pairs for LibSVM / liblinear
(Adulttrain_x_array, Adulttrain_y, Adulttest_x_array, Adulttest_y)= transfer_data(adult_traind,adult_testd)
# The Adult dataset for SVMlight has already been created in the group project
# Move 'usr' (the regression target, column index 22) to the last column,
# since transfer_data/generate_data_file treat the last column as the label
last_col = CPU_clean.pop(CPU_clean.columns[22])
CPU_clean=pd.concat( (CPU_clean, last_col.to_frame()),axis=1)
CPU_clean.head()
# Divide the dataset into 80% train / 20% test
from sklearn.model_selection import train_test_split
#data: dataset that needs to be divided
#random_state: fixed seed so the split is reproducible
#test_size: proportion of the test set
CPU_clean_traind, CPU_clean_testd = train_test_split(CPU_clean, test_size=0.2, random_state=42)
# Generate (X array, y array) pairs for LibSVM / liblinear
(CPUtrain_x_array, CPUtrain_y, CPUtest_x_array, CPUtest_y)= transfer_data(CPU_clean_traind,CPU_clean_testd)
# Generate SVMlight-format text files in the folder C:\Users\sunjin\svmlight-loader
generate_data_file(CPU_clean_traind,"CPU_clean_traind_svmlight.txt")
generate_data_file(CPU_clean_testd, "CPU_clean_testd_svmlight.txt")
from liblinear.liblinearutil import train, predict
# '-s 12' selects liblinear's L2-regularized L2-loss SVR (regression), '-c 4' sets C
m_liblinearCensus = train(Censustrain_y, Censustrain_x_array, '-s 12 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(Censustrain_y, Censustrain_x_array, m_liblinearCensus)
p_test_label, p_test_acc, p_test_val = predict(Censustest_y, Censustest_x_array, m_liblinearCensus)
import time
# Re-train once more purely to measure the training time
start = time.process_time()
train(Censustrain_y, Censustrain_x_array,'-s 12 -c 4')
end = time.process_time()
print (end-start)
# p_*_acc = (accuracy, MSE, squared correlation coefficient) per liblinear
print('The Mean squared error of Census(train set) by using Normal Liblinear is '+str(p_train_acc[1])+' (regression)')
print('The Squared correlation coefficient of Census(train set) by using Normal Liblinear is '+str(p_train_acc[2])+' (regression)')
print('The Mean squared error of Census(test set) by using Normal Liblinear is '+str(p_test_acc[1])+' (regression)')
print('The Squared correlation coefficient of Census(test set) by using Normal Liblinear is '+str(p_test_acc[2])+' (regression)')
print('The time of Census by using Normal Liblinear is '+str(end-start)+' seconds')
from libsvm.svmutil import svm_read_problem, svm_train, svm_predict
# '-s 3' = epsilon-SVR, '-t 2' = RBF (Gaussian) kernel, '-c 4' = cost parameter
m_libsvmLRCensus_gaussian = svm_train(Censustrain_y, Censustrain_x_array, '-s 3 -t 2 -c 4')
p_train_label, p_train_acc, p_train_val = svm_predict(Censustrain_y, Censustrain_x_array, m_libsvmLRCensus_gaussian)
p_test_label, p_test_acc, p_test_val = svm_predict(Censustest_y, Censustest_x_array, m_libsvmLRCensus_gaussian)
# Re-train once more purely to measure the training time
start = time.process_time()
svm_train(Censustrain_y, Censustrain_x_array, '-s 3 -t 2 -c 4')
end = time.process_time()
print (end-start)
# p_*_acc = (accuracy, MSE, squared correlation coefficient) per libsvm
print('The Mean squared error of Census(train set) by using Libsvm- Gaussian Kernel is '+str(p_train_acc[1])+' (regression)')
print('The Squared correlation coefficient of Census(train set) by using Libsvm- Gaussian Kernel is '+str(p_train_acc[2])+' (regression)')
print('The Mean squared error of Census(test set) by using Libsvm- Gaussian Kernel is '+str(p_test_acc[1])+' (regression)')
print('The Squared correlation coefficient of Census(test set) by using Libsvm- Gaussian Kernel is '+str(p_test_acc[2])+' (regression)')
print('The time of Census by using Libsvm- Gaussian Kernel is '+str(end-start)+' seconds')
# Powershell code (SVMlight was run externally as an .exe):
## ./svm_learn.exe -z r -t 2 -c 4 Census/Census_clean_traind_svmlight.txt Census/model
## ./svm_classify.exe Census/Census_clean_traind_svmlight.txt Census/model Census/predicidons
# Read the prediction txt file, save as Census_clean_traind_predict
with open("C:\\Users\\sunjin\\Desktop\\machine learning group project\\svm_light\\Census\\Census_clean_traind_predict.txt", "r") as f:
    Census_clean_traind_predict = f.readlines()
# Strip the trailing '\n' and convert each prediction from str to float
Census_clean_traind_predictnew = [float(line.strip('\n')) for line in Census_clean_traind_predict]
# Hoist the target column into a list ONCE — the original rebuilt
# list(Census_clean_traind['H23.B']) inside every loop iteration (O(n^2)),
# and recomputed np.mean(...) per iteration as well
census_train_target = list(Census_clean_traind['H23.B'])
census_train_mean = np.mean(census_train_target)
# MSE of the SVMlight predictions (train set)
mc = [(pred - actual) ** 2
      for pred, actual in zip(Census_clean_traind_predictnew, census_train_target)]
CensussvmlightMSE = sum(mc) / len(Census_clean_traind_predictnew)
## R_square of the SVMlight predictions (train set): 1 - SSE/SST
ssrc = [(actual - pred) ** 2
        for actual, pred in zip(census_train_target, Census_clean_traind_predictnew)]
CensussvmlighttraindSSR = sum(ssrc)
sstc = [(actual - census_train_mean) ** 2 for actual in census_train_target]
CensussvmlighttraindSST = sum(sstc)
CensussvmlighttraindR_square = 1 - (CensussvmlighttraindSSR / CensussvmlighttraindSST)
# Powershell code (SVMlight test-set prediction, run externally):
## ./svm_classify.exe Census/Census_clean_testd_svmlight.txt Census/model Census/predicidons2
# Release the train-set scratch lists before reusing the names for the test set
del mc
del ssrc
del sstc
# Read the prediction txt file, save as Census_clean_testd_predict
with open("C:\\Users\\sunjin\\Desktop\\machine learning group project\\svm_light\\Census\\Census_clean_testd_predict.txt", "r") as f:
    Census_clean_testd_predict = f.readlines()
# Strip the trailing '\n' and convert each prediction from str to float
Census_clean_testd_predictnew = [float(line.strip('\n')) for line in Census_clean_testd_predict]
# Hoist the target column / its mean ONCE — the original rebuilt
# list(Census_clean_testd['H23.B']) inside every loop iteration (O(n^2))
census_test_target = list(Census_clean_testd['H23.B'])
census_test_mean = np.mean(census_test_target)
# MSE of the SVMlight predictions (test set)
mc = [(pred - actual) ** 2
      for pred, actual in zip(Census_clean_testd_predictnew, census_test_target)]
CensussvmlighttestdMSE = sum(mc) / len(Census_clean_testd_predictnew)
# R_square of the SVMlight predictions (test set): 1 - SSE/SST
ssrc = [(actual - pred) ** 2
        for actual, pred in zip(census_test_target, Census_clean_testd_predictnew)]
CensussvmlighttestdSSR = sum(ssrc)
sstc = [(actual - census_test_mean) ** 2 for actual in census_test_target]
CensussvmlighttestdSST = sum(sstc)
CensussvmlighttestdR_square = 1 - (CensussvmlighttestdSSR / CensussvmlighttestdSST)
# Report metrics; the 29.16 s training time was measured externally in PowerShell
print('The Mean squared error of Census(train set) by using Svmlight- Gaussian Kernel is '+str(CensussvmlightMSE)+' (regression)')
print('The Squared correlation coefficient of Census(train set) by using Svmlight- Gaussian Kernel is '+str(CensussvmlighttraindR_square)+' (regression)')
print('The Mean squared error of Census(test set) by using Svmlight- Gaussian Kernel is '+str(CensussvmlighttestdMSE)+' (regression)')
print('The Squared correlation coefficient of Census(test set) by using Svmlight- Gaussian Kernel is '+str(CensussvmlighttestdR_square)+' (regression)')
print('The time of Census by using Svmlight- Gaussian Kernel is '+str(29.16)+' seconds')
from liblinear.liblinearutil import train, predict
# '-s 0' = L2-regularized logistic regression (classification); the original
# inline comment ('-s 12' regression) was copy-pasted from the Census cell
m_liblinearAdult = train(Adulttrain_y, Adulttrain_x_array, '-s 0 -c 4')
p_train_label, p_train_acc, p_train_val = predict(Adulttrain_y, Adulttrain_x_array, m_liblinearAdult)
p_test_label, p_test_acc, p_test_val = predict(Adulttest_y, Adulttest_x_array, m_liblinearAdult)
import time
# Re-train once more purely to measure the training time
start = time.process_time()
train(Adulttrain_y, Adulttrain_x_array, '-s 0 -c 4')
end = time.process_time()
print (end-start)
print('The Accuracy of Adult(train set) by using Normal Liblinear is '+str(p_train_acc[0])+' (classification)')
print('The Accuracy of Adult(test set) by using Normal Liblinear is '+str(p_test_acc[0])+' (classification)')
print('The time of Adult by using Normal Liblinear is '+str(end-start)+' seconds')
# LibSVM - Gaussian Kernel
from libsvm.svmutil import svm_read_problem, svm_train, svm_predict
# '-s 0' = C-SVC (classification), '-t 2' = RBF (Gaussian) kernel, '-c 4' = cost
m_libsvmAdult_gaussian = svm_train(Adulttrain_y, Adulttrain_x_array, '-s 0 -t 2 -c 4')
p_train_label, p_train_acc, p_train_val = svm_predict(Adulttrain_y, Adulttrain_x_array, m_libsvmAdult_gaussian)
p_test_label, p_test_acc, p_test_val = svm_predict(Adulttest_y, Adulttest_x_array, m_libsvmAdult_gaussian)
import time
# Re-train once more purely to measure the training time
start = time.process_time()
svm_train(Adulttrain_y, Adulttrain_x_array, '-s 0 -t 2 -c 4')
end = time.process_time()
print (end-start)
print('The Accuracy of Adult(train set) by using LibSVM - Gaussian Kernel is '+str(p_train_acc[0])+' (classification)')
print('The Accuracy of Adult(test set) by using LibSVM - Gaussian Kernel is '+str(p_test_acc[0])+' (classification)')
print('The time of Adult by using LibSVM - Gaussian Kernel is '+str(end-start)+' seconds')
# PowerShell commands used for the Adult dataset with the external SVMlight exe:
## ./svm_learn.exe -t 2 adultdata/adulttrain_data adultdata/model
## ./svm_classify.exe adultdata/adulttrain_data adultdata/model adultdata/predicidons
## ./svm_classify.exe adultdata/adulttest_data adultdata/model adultdata/predicidons2
from liblinear.liblinearutil import train, predict
# '-s 12' selects liblinear's L2-regularized L2-loss SVR (regression), '-c 4' sets C
m_liblinearCPU = train(CPUtrain_y, CPUtrain_x_array, '-s 12 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(CPUtrain_y, CPUtrain_x_array, m_liblinearCPU)
p_test_label, p_test_acc, p_test_val = predict(CPUtest_y, CPUtest_x_array, m_liblinearCPU)
import time
# Re-train once more purely to measure the training time
start = time.process_time()
train(CPUtrain_y, CPUtrain_x_array,'-s 12 -c 4')
end = time.process_time()
print (end-start)
print('The Mean squared error of CPU(train set) by using Normal liblinear is '+str(p_train_acc[1])+' (regression)')
print('The Squared correlation coefficient of CPU(train set) by using Normal liblinear is '+str(p_train_acc[2])+' (regression)')
print('The Mean squared error of CPU(test set) by using Normal liblinear is '+str(p_test_acc[1])+' (regression)')
print('The Squared correlation coefficient of CPU(test set) by using Normal liblinear is '+str(p_test_acc[2])+' (regression)')
print('The time of CPU by using Normal liblinear is '+str(end-start)+' seconds')
# CPU
from libsvm.svmutil import svm_read_problem, svm_train, svm_predict
# '-s 3' = epsilon-SVR, '-t 2' = RBF (Gaussian) kernel, '-c 4' = cost parameter
m_libsvmLRCPU_gaussian = svm_train(CPUtrain_y, CPUtrain_x_array, '-s 3 -t 2 -c 4')
p_train_label, p_train_acc, p_train_val = svm_predict(CPUtrain_y, CPUtrain_x_array, m_libsvmLRCPU_gaussian)
p_test_label, p_test_acc, p_test_val = svm_predict(CPUtest_y, CPUtest_x_array, m_libsvmLRCPU_gaussian)
# Re-train once more purely to measure the training time
start = time.process_time()
svm_train(CPUtrain_y, CPUtrain_x_array, '-s 3 -t 2 -c 4')
end = time.process_time()
print (end-start)
print('The Mean squared error of CPU(train set) by using LibSVM - Gaussian Kernel is '+str(p_train_acc[1])+' (regression)')
print('The Squared correlation coefficient of CPU(train set) by using LibSVM - Gaussian Kernel is '+str(p_train_acc[2])+' (regression)')
print('The Mean squared error of CPU(test set) by usingLibSVM - Gaussian Kernel is '+str(p_test_acc[1])+' (regression)')
print('The Squared correlation coefficient of CPU(test set) by using LibSVM - Gaussian Kernel is '+str(p_test_acc[2])+' (regression)')
print('The time of CPU by using LibSVM - Gaussian Kernel is '+str(end-start)+' seconds')
# PowerShell commands used for the CPU dataset with the external SVMlight exe:
## ./svm_learn.exe -z r -t 2 -c 4 CPUdata/CPU_clean_traind_svmlight.txt CPUdata/model
## ./svm_classify.exe CPUdata/CPU_clean_traind_svmlight.txt CPUdata/model CPUdata/predicidons
## ./svm_classify.exe CPUdata/CPU_clean_testd_svmlight.txt CPUdata/model CPUdata/predicidons2
# Release the Census scratch lists before reusing the names for the CPU metrics
del mc
del ssrc
del sstc
# Read the prediction txt file, save as CPU_clean_traind_predict
with open("C:\\Users\\sunjin\\Desktop\\machine learning group project\\svm_light\\CPUdata\\CPU_clean_train_predict.txt", "r") as f:
    CPU_clean_traind_predict = f.readlines()
# Strip the trailing '\n' and convert each prediction from str to float
CPU_clean_traind_predictnew = [float(line.strip('\n')) for line in CPU_clean_traind_predict]
# Hoist the target column / its mean ONCE — the original rebuilt
# list(CPU_clean_traind['usr']) and recomputed np.mean(...) inside every
# loop iteration (O(n^2))
cpu_train_target = list(CPU_clean_traind['usr'])
cpu_train_mean = np.mean(cpu_train_target)
# MSE of the SVMlight predictions (train set)
mc = [(pred - actual) ** 2
      for pred, actual in zip(CPU_clean_traind_predictnew, cpu_train_target)]
CPUsvmlighttraindMSE = sum(mc) / len(CPU_clean_traind_predictnew)
# R_square of the SVMlight predictions (train set).
# NOTE(review): here R^2 = SSreg/SST (explained variance of the predictions),
# unlike the Census cells which use 1 - SSE/SST; kept as-is to preserve the
# reported numbers — confirm which definition was intended.
ssrc = [(pred - cpu_train_mean) ** 2 for pred in CPU_clean_traind_predictnew]
CPUsvmlighttraindSSR = sum(ssrc)
sstc = [(actual - cpu_train_mean) ** 2 for actual in cpu_train_target]
CPUsvmlighttraindSST = sum(sstc)
CPUsvmlighttraindR_square=CPUsvmlighttraindSSR/CPUsvmlighttraindSST
# Release the train-set scratch lists before reusing the names for the test set
del mc
del ssrc
del sstc
# Read the prediction txt file, save as CPU_clean_testd_predict
with open("C:\\Users\\sunjin\\Desktop\\machine learning group project\\svm_light\\CPUdata\\CPU_clean_testd_predict.txt", "r") as f:
    CPU_clean_testd_predict = f.readlines()
# Strip the trailing '\n' and convert each prediction from str to float
CPU_clean_testd_predictnew = [float(line.strip('\n')) for line in CPU_clean_testd_predict]
# Hoist the target column / its mean ONCE — the original rebuilt
# list(CPU_clean_testd['usr']) and recomputed np.mean(...) inside every
# loop iteration (O(n^2))
cpu_test_target = list(CPU_clean_testd['usr'])
cpu_test_mean = np.mean(cpu_test_target)
# MSE of the SVMlight predictions (test set)
mc = [(pred - actual) ** 2
      for pred, actual in zip(CPU_clean_testd_predictnew, cpu_test_target)]
CPUsvmlighttestdMSE = sum(mc) / len(CPU_clean_testd_predictnew)
# R_square of the SVMlight predictions (test set).
# NOTE(review): SSreg/SST form, mirroring the train-set cell above.
ssrc = [(pred - cpu_test_mean) ** 2 for pred in CPU_clean_testd_predictnew]
CPUsvmlighttestdSSR = sum(ssrc)
sstc = [(actual - cpu_test_mean) ** 2 for actual in cpu_test_target]
CPUsvmlighttestdSST = sum(sstc)
CPUsvmlighttestdR_square=CPUsvmlighttestdSSR/CPUsvmlighttestdSST
# Report metrics; the 0.24 s training time was measured externally in PowerShell
print('The Mean squared error of CPU(train set) by using Svmlight- Gaussian Kernel is '+str(CPUsvmlighttraindMSE)+' (regression)')
print('The Squared correlation coefficient of CPU(train set) by using Svmlight- Gaussian Kernel is '+str(CPUsvmlighttraindR_square)+' (regression)')
print('The Mean squared error of CPU(test set) by using Svmlight- Gaussian Kernel is '+str(CPUsvmlighttestdMSE)+' (regression)')
print('The Squared correlation coefficient of CPU(test set) by using Svmlight- Gaussian Kernel is '+str(CPUsvmlighttestdR_square)+' (regression)')
print('The time of CPU by using Svmlight- Gaussian Kernel is '+str(0.24)+' seconds')
# Define the RFF (Random Fourier Features) helper class
# NOTE(review): this rebinds `data` (previously a DataFrame above) to a path
# string that does not appear to be used afterwards — confirm it can be removed
data = '../data/'
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
from scipy.stats import cauchy, laplace
from sklearn.metrics.pairwise import rbf_kernel, laplacian_kernel
class RFF(BaseEstimator):
    """Random Fourier Features approximation of an RBF kernel feature map.

    After fit(), transform(X) maps X (n_samples, n_features) to
    Z(X) (n_samples, D) such that Z(x).Z(y) approximates the RBF kernel
    exp(-gamma * ||x - y||^2).
    """
    def __init__(self, gamma = 0.01, D = 500, metric = "rbf"):
        self.gamma = gamma      # kernel bandwidth parameter
        self.metric = metric    # only "rbf" is implemented
        self.D = D              # number of random features
        self.fitted = False
    def fit(self, X):
        """ Generates MonteCarlo random samples """
        d = X.shape[1]
        if self.metric == "rbf":
            # Projection directions w ~ N(0, 2*gamma*I) and phases u ~ U[0, 2*pi)
            self.w = np.sqrt(2*self.gamma)*np.random.normal(size=(self.D,d))
            self.u = 2*np.pi*np.random.rand(self.D)
        else:
            # The original marked itself fitted WITHOUT sampling w/u for any
            # non-"rbf" metric, so transform() later died with a confusing
            # AttributeError; fail fast with a clear message instead.
            raise ValueError("Unsupported metric %r; only 'rbf' is implemented" % (self.metric,))
        self.fitted = True
        return self
    def transform(self,X):
        """ Transforms the data X (n_samples, n_features) to the new map space Z(X) (n_samples, n_components)"""
        if not self.fitted:
            raise NotFittedError("RBF_MonteCarlo must be fitted before computing the feature map Z")
        Z = np.sqrt(2/self.D)*np.cos((X.dot(self.w.T) + self.u[np.newaxis,:]))
        print(Z.shape)
        return Z
# Census
# Delete the Label column (last) and keep only the features
Census_clean_traind_x=Census_clean_traind.iloc[:,0:-1]
Census_clean_testd_x=Census_clean_testd.iloc[:,0:-1]
# Fit the random features on the train split, then map BOTH splits with the
# same sampled w/u so train and test live in the same feature space
rff = RFF()
rff.fit(Census_clean_traind_x)
Census_clean_traind_x_rff = rff.transform(Census_clean_traind_x)
Census_clean_testd_x_rff = rff.transform(Census_clean_testd_x)
# Change the rff feature dataset into array format
Census_clean_traind_x_rff2 = Census_clean_traind_x_rff.copy()
Census_clean_traind_x_rff_array = np.array(Census_clean_traind_x_rff2)
Census_clean_testd_x_rff2 = Census_clean_testd_x_rff.copy()
Census_clean_testd_x_rff_array = np.array(Census_clean_testd_x_rff2)
from liblinear.liblinearutil import train, predict
# Linear SVR ('-s 12') on the RFF features approximates the Gaussian-kernel SVR
m_liblinearCensus = train(Censustrain_y, Census_clean_traind_x_rff_array, '-s 12 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(Censustrain_y, Census_clean_traind_x_rff_array, m_liblinearCensus)
p_test_label, p_test_acc, p_test_val = predict(Censustest_y, Census_clean_testd_x_rff_array, m_liblinearCensus)
import time
# Re-train once more purely to measure the training time
start = time.process_time()
train(Censustrain_y, Census_clean_traind_x_rff_array,'-s 12 -c 4')
end = time.process_time()
print (end-start)
# Adult
# Delete the Label column (last) and keep only the features
Adult_clean_traind_x=adult_traind.iloc[:,0:-1]
Adult_clean_testd_x=adult_testd.iloc[:,0:-1]
# Fit the random features on the train split, then map BOTH splits with the
# same sampled w/u so train and test live in the same feature space
rff = RFF()
rff.fit(Adult_clean_traind_x)
Adult_clean_traind_x_rff = rff.transform(Adult_clean_traind_x)
Adult_clean_testd_x_rff = rff.transform(Adult_clean_testd_x)
# Change the rff feature dataset into array format
Adult_clean_traind_x_rff2 = Adult_clean_traind_x_rff.copy()
Adult_clean_traind_x_rff_array = np.array(Adult_clean_traind_x_rff2)
Adult_clean_testd_x_rff2 = Adult_clean_testd_x_rff.copy()
Adult_clean_testd_x_rff_array = np.array(Adult_clean_testd_x_rff2)
from liblinear.liblinearutil import train, predict
# Linear classifier ('-s 0') on the RFF features approximates the Gaussian-kernel SVC
m_liblinearAdult = train(Adulttrain_y, Adult_clean_traind_x_rff_array, '-s 0 -c 4')
p_train_label, p_train_acc, p_train_val = predict(Adulttrain_y, Adult_clean_traind_x_rff_array, m_liblinearAdult)
p_test_label, p_test_acc, p_test_val = predict(Adulttest_y,Adult_clean_testd_x_rff_array, m_liblinearAdult)
import time
# Re-train once more purely to measure the training time
start = time.process_time()
train(Adulttrain_y, Adult_clean_traind_x_rff_array, '-s 0 -c 4')
end = time.process_time()
print (end-start)
# CPU
# Delete the Label column (last) and keep only the features
CPU_clean_traind_x=CPU_clean_traind.iloc[:,0:-1]
CPU_clean_testd_x=CPU_clean_testd.iloc[:,0:-1]
# Fit the random features on the train split, then map BOTH splits with the
# same sampled w/u so train and test live in the same feature space
rff = RFF()
rff.fit(CPU_clean_traind_x)
CPU_clean_traind_x_rff = rff.transform(CPU_clean_traind_x)
CPU_clean_testd_x_rff = rff.transform(CPU_clean_testd_x)
# Change the rff feature dataset into array format
CPU_clean_traind_x_rff2 = CPU_clean_traind_x_rff.copy()
CPU_clean_traind_x_rff_array = np.array(CPU_clean_traind_x_rff2)
CPU_clean_testd_x_rff2 = CPU_clean_testd_x_rff.copy()
CPU_clean_testd_x_rff_array = np.array(CPU_clean_testd_x_rff2)
from liblinear.liblinearutil import train, predict
# Linear SVR ('-s 12') on the RFF features approximates the Gaussian-kernel SVR
m_liblinearCPU = train(CPUtrain_y, CPU_clean_traind_x_rff_array, '-s 12 -c 4') #'-s 12' regression
p_train_label, p_train_acc, p_train_val = predict(CPUtrain_y, CPU_clean_traind_x_rff_array, m_liblinearCPU)
p_test_label, p_test_acc, p_test_val = predict(CPUtest_y, CPU_clean_testd_x_rff_array, m_liblinearCPU)
import time
# Re-train once more purely to measure the training time
start = time.process_time()
train(CPUtrain_y, CPU_clean_traind_x_rff_array,'-s 12 -c 4')
end = time.process_time()
print (end-start)
# Total CPU time of the whole notebook run (start0 set at the top of the file)
end0 = time.process_time()
print (end0-start0)
# Z-score standardization variants of the three cleaned datasets.
# The original three lines each fused several statements onto one line with
# garbled identifiers (e.g. 'combineadultclean', 'Income<=50K'), which is not
# valid Python; reconstructed here from the matching cleaning cells above.
# NOTE(review): by this point Census_clean / Onehotc_adult_clean / CPU_clean
# have already been min-max scaled and reordered above, so the frames are
# rebuilt from the raw data before standardizing.
Census_clean=Census_Dataset.drop(['State','Code','H23.A','H23.C','H24','P4.1','P4.2','P4.3','P4.4','H4.1','H4.2','H4.3','H4.4','H35.1'], axis = 1)
Census_cleanZ_score = (Census_clean - Census_clean.mean()) / (Census_clean.std())
Onehotc_adult_clean=pd.get_dummies(combine_adultclean,dtype='int8').drop(['Income_<=50K'], axis = 1)
Onehotc_adult_clean['Income_>50K']=Onehotc_adult_clean['Income_>50K'].replace(0, -1)
Onehotc_adult_cleanZ_score = (Onehotc_adult_clean - Onehotc_adult_clean.mean()) / (Onehotc_adult_clean.std())
CPU_clean=CPU.drop(['time'], axis = 1)
CPU_cleanZ_score = (CPU_clean - CPU_clean.mean()) / (CPU_clean.std())